home *** CD-ROM | disk | FTP | other *** search
- // CMU96
- // .html to .txt converter.
-
- // Supplied 'as is'
- // NO WARRANTY INCLUDED!
- // If this code doesn't do what you expect, sorry, but it isn't my fault!
- // It isn't likely to be damaging but there is a small chance I've overlooked
- // something. It works for me though.
-
- // Compiled with SAS/C compiler.
- // I can't guarantee that this compiles with anything else but it should be about
- // 95% portable across Amiga, PC and Unix systems.
-
- /*
- ** One day, it might...
- **
- ** remove all unknown tags.
- ** cat from and to files.
- ** have command line switches and intelligence.
- ** do clever stuff with word wrap and line wrap etc.
- ** have adjustable margins/tab spaces etc.
- ** recognise <br> <p>PARAGRAPH</p> <a href>INVERT</a> <center>CENTER</center>
- ** <h?>PARAGRAPH</h?> <blink>UNDERLINE</blink> <title>TITLE\n</title>
- ** <pre>PREFORMATTED</pre> <!-- Comment <br> stuff -->
- */
-
- // But I doubt it...
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
// Size in characters of the word buffer; also the column limit that the
// main loop wraps output lines against.
#define width 77

// Read the next whitespace-delimited word, or a complete <...> tag, from
// `in` into `word`.
void getword(char *word, FILE *in);

// Act on the tag held in `word`, writing any resulting output to `out`.
// `x` is the current output column and `pure` suppresses escape sequences.
// Returns the updated column.
int preprocess(char *word, FILE *in, FILE *out, int x, int pure);

// Returns 1 if `word` starts with '<', otherwise 0.
int tag(char *word);

// Returns 1 if the text following the first character of `word` begins with
// `check`, otherwise 0.
int tagis(char *word, char *check);
-
- main(int argc,char *argv[])
- {
- FILE *in,*out;
- char word[width];
- int argin=1,argout=2,pure=0;
-
- int x=0; // THE X Value (distance from left edge of line in characters)
-
- // Check for correct arguements. If bad, report usage
- if ((argc<2) || (argc>4) || (strcmp(argv[1],"?")==0))
- {
- printf("Usage: %s (-h|-help)|([-p|-pure] infile.html [outfile.txt])\n"
- ,argv[0]);
- exit(0);
- }
-
- // Check for -help flag
- if ((strcmp(argv[1],"-help")==0) || (strcmp(argv[1],"-h")==0))
- {
- printf("\n");
- printf("%s by Chris Underwood. Compiled with SAS/C. CMU",argv[0]);
- puts("\n");
- puts("Converts .html files into a readable form for displaying in a text");
- puts("only shell. An outfile may be specified or the produced text can be");
- puts("redirected or piped.");
- puts("Options:");
- puts(" -h -help\tPrints this text");
- puts(" -p -pure\tSurpress any non-ascii output (inverse text etc)");
- puts(" infile\t\tSource html file (.html)");
- puts(" outfile\tDestination file (.txt)");
- puts("If no outfile is specified then text output will be sent to stdout.");
- puts("If outfile exists it will be overwritten.");
- puts("This program is supplied \'as is\' and has absolutly no warrenty");
- puts("whatsoever. If it (unlikely) screws up your HD, it \'aint my fault!");
- printf("\n");
- exit(0);
- }
-
- // Check for pure flag, and adjust where to expect *in and *out in the
- // arguement list.
- if ((strcmp(argv[1],"-p")==0) || (strcmp(argv[1],"-pure")==0))
- {
- argin=2; // Set in and out args
- argout=3;
- pure=1; // Set pure flag
- }
-
- // Open infile and check to see if it exists. Give error if necessary.
- if ((in=fopen(argv[argin],"r"))==NULL)
- {
- fprintf(stderr,"%s: Failed to open file %s for reading\n",
- argv[0],argv[argin]);
- exit(1);
- }
-
- // Similar check and open outfile.
- if (argc==argout+1)
- {
- if ((out=fopen(argv[argout],"w"))==NULL)
- {
- fprintf(stderr,"%s: Failed to open file %s for writing\n",
- argv[0],argv[argout]);
- exit(1);
- }
- } else {
- // Set out to be pointing at stdout. Somehow, this actually works!
- out=stdout;
- }
-
- // Main processing loop
- while (!feof(in)) // Better check for eof alot...
- {
- getword(word,in);
- if (tag(word)) x=preprocess(word,in,out,x,pure); else
- {
- if (x+strlen(word)+1<width) // If word fits on line...
- {
- x+=strlen(word)+1; // Update the X value
- fprintf(out,"%s ",word); // And print the word (with space)
- }
- else
- {
- x=strlen(word)+1; // Reset the X value
- fprintf(out,"\n%s ",word); // And print the word on a new line
- }
- }
- }
-
- // Cleanup, close files then exit.
- if (!pure)
- {
- fprintf(out,"%c0m\n",155); // Print a newline and set normal text
- }
- else
- {
- fprintf(out,"\n"); // Shouldn't set normal text here - code is impure
- }
- fclose(in);
- fclose(out);
- exit(0); // Successful conversion from html to txt!
- }
-
- void getword(char *word,FILE *in)
- {
- int j,k,ch;
- j=0;
- for (k=0; k<width; k++) word[k]='\0'; // Like strnset(), except this works.
- word[0]=getc(in);
- // Remove leading spaces
- while (((word[0]==' ') || (word[0]=='\n') || (word[0]=='\t')) && !feof(in))
- word[0]=getc(in);
- if (word[0]=='<') // Tag found.
- {
- k=1; // Start paranthesis count
- while (!feof(in)) // Better be safe now...
- {
- j++;
- if (j==width) j--; // Simple fix suggested by Xav (cheers).
- // Not very neat, but it works since only comment
- // tags and bad-html have tags this long
- word[j]=getc(in);
- if (word[j]=='<') k++; // Like a stack, but with no data!
- if (word[j]=='>') k--;
- if (k==0) break; // Angle brackets are now matched
- }
- }
- else // Not a tag - a normal word.
- {
- // Get the word
- while (!feof(in)) // While not eof
- {
- ch=getc(in);
- if (ch==' ') break; // And not a space...
- if (ch=='\n') break; // And not a newline...
- if (ch=='\t') break; // And not a tab charecter...
- if (j==width) break; // And not the max width of a word...
- if (ch=='<')
- {
- ungetc(ch,in);
- break;
- }
- j++;
- word[j]=ch; // Attach ch to the end of word[]
- }
- if (feof(in)) word[0]='\0';
- }
- }
-
// Act on the tag held in `word`.
//
//   word  the tag text, starting with '<'
//   in    input stream; read only while copying a <pre> section
//   out   stream that receives all output produced here
//   x     current output column
//   pure  when non-zero, no escape sequences are emitted
//
// Tags are recognised by prefix (see tagis()). Line-breaking tags emit a
// newline, title/a/blink toggle inverted text unless `pure` is set, and
// <pre> copies the input verbatim up to the closing </pre>.
//
// Returns the output column after the tag has been handled.
int preprocess(char *word,FILE *in,FILE *out,int x,int pure)
{
    char smallbuffer[7]; // Sliding 6 character window used to spot </pre>
    int filled;          // Number of valid characters in smallbuffer
    int ch;              // int, so that EOF is distinguishable from data

    if (tagis(word,"br") || tagis(word,"h") || tagis(word,"p") ||
        tagis(word,"/h") || tagis(word,"/p"))
    {
        fprintf(out,"\n"); // Print a newline
        x=0;               // Back at the start of the line
    }

    if (!pure) // If we are not giving pure output...
    {
        if (tagis(word,"title") || tagis(word,"a") || tagis(word,"blink"))
        {
            fprintf(out,"%c7m",155); // Set inverted text
        }

        if (tagis(word,"/title") || tagis(word,"/a") || tagis(word,"/blink"))
        {
            fprintf(out,"%c0m",155); // Set normal text
            fprintf(out,"\b ");
        }
    }

    if (tagis(word,"/title")) // The title gets a line of its own
    {
        x=0;
        fprintf(out,"\n");
    }

    if (tagis(word,"pre"))
    {
        fprintf(out,"\n"); // Must go to out, not stdout
        x=0;               // A newline is printed after </pre> as well

        // Fill the window. Zeroing first keeps strcmp() well defined even
        // if the input ends before six characters have arrived.
        memset(smallbuffer,0,sizeof smallbuffer);
        filled=0;
        while ((filled<6) && ((ch=fgetc(in))!=EOF))
        {
            smallbuffer[filled++]=(char)ch;
        }

        // Slide the window along until it holds the closing tag
        while ((filled==6) && (strcmp(smallbuffer,"</pre>")!=0))
        {
            fputc(smallbuffer[0],out);           // Emit the oldest char
            memmove(smallbuffer,smallbuffer+1,5); // Shuffle buffer left
            ch=fgetc(in);
            if (ch==EOF)
            {
                smallbuffer[5]='\0';
                filled=5;
            }
            else
            {
                smallbuffer[5]=(char)ch;
            }
        }

        // If the input ended without </pre> the window still holds real
        // text; otherwise it holds only the closing tag, which is dropped.
        if (filled<6)
        {
            fwrite(smallbuffer,1,(size_t)filled,out);
        }
        fprintf(out,"\n");
    }

    return x;
}
-
// Prefix test for tag names.
//
// Compares `check` against the characters of `word` that follow its first
// character (the '<' of a tag). Returns 1 if every character of `check`
// matches, 0 at the first difference. An empty `check` always matches.
int tagis(char *word,char *check)
{
    const char *have=word+1; // Skip the leading character of word
    const char *want=check;

    while (*want!='\0')
    {
        if (*have!=*want) return 0; // Mismatch: no point in continuing
        have++;
        want++;
    }
    return 1; // Ran off the end of check without a difference
}
-
// Returns 1 if `word` begins with '<' (i.e. it holds a tag), 0 otherwise.
int tag(char *word)
{
    return word[0]=='<';
}
-